{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import sentencepiece as spm\n",
    "import json\n",
    "from glob import glob\n",
    "import os\n",
    "from tensor2tensor.data_generators import problem\n",
    "from tensor2tensor.data_generators import text_problems\n",
    "from tensor2tensor.utils import registry\n",
    "from tensor2tensor.layers import modalities\n",
    "import tensorflow as tf\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab = 'sp10m.cased.t5.model'\n",
    "sp = spm.SentencePieceProcessor()\n",
    "sp.Load(vocab)\n",
    "\n",
    "\n",
    "class Encoder:\n",
    "    def __init__(self, sp):\n",
    "        self.sp = sp\n",
    "        self.vocab_size = sp.GetPieceSize() + 100\n",
    "\n",
    "    def encode(self, s):\n",
    "        return self.sp.EncodeAsIds(s)\n",
    "\n",
    "    def decode(self, ids, strip_extraneous = False):\n",
    "        return self.sp.DecodeIds(list(ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "d = [\n",
    "    {'class': 0, 'Description': 'PAD', 'salah': '', 'betul': ''},\n",
    "    {\n",
    "        'class': 1,\n",
    "        'Description': 'kesambungan subwords',\n",
    "        'salah': '',\n",
    "        'betul': '',\n",
    "    },\n",
    "    {\n",
    "        'class': 2,\n",
    "        'Description': 'tiada kesalahan',\n",
    "        'salah': '',\n",
    "        'betul': '',\n",
    "    },\n",
    "    {\n",
    "        'class': 3,\n",
    "        'Description': 'kesalahan frasa nama, Perkara yang diterangkan mesti mendahului \"penerang\"',\n",
    "        'salah': 'Cili sos',\n",
    "        'betul': 'sos cili',\n",
    "    },\n",
    "    {\n",
    "        'class': 4,\n",
    "        'Description': 'kesalahan kata jamak',\n",
    "        'salah': 'mereka-mereka',\n",
    "        'betul': 'mereka',\n",
    "    },\n",
    "    {\n",
    "        'class': 5,\n",
    "        'Description': 'kesalahan kata penguat',\n",
    "        'salah': 'sangat tinggi sekali',\n",
    "        'betul': 'sangat tinggi',\n",
    "    },\n",
    "    {\n",
    "        'class': 6,\n",
    "        'Description': 'kata adjektif dan imbuhan \"ter\" tanpa penguat.',\n",
    "        'salah': 'Sani mendapat markah yang tertinggi sekali.',\n",
    "        'betul': 'Sani mendapat markah yang tertinggi.',\n",
    "    },\n",
    "    {\n",
    "        'class': 7,\n",
    "        'Description': 'kesalahan kata hubung',\n",
    "        'salah': 'Sally sedang membaca bila saya tiba di rumahnya.',\n",
    "        'betul': 'Sally sedang membaca apabila saya tiba di rumahnya.',\n",
    "    },\n",
    "    {\n",
    "        'class': 8,\n",
    "        'Description': 'kesalahan kata bilangan',\n",
    "        'salah': 'Beribu peniaga tidak membayar cukai pendapatan.',\n",
    "        'betul': 'Beribu-ribu peniaga tidak membayar cukai pendapatan',\n",
    "    },\n",
    "    {\n",
    "        'class': 9,\n",
    "        'Description': 'kesalahan kata sendi',\n",
    "        'salah': 'Umar telah berpindah daripada sekolah ini bulan lalu.',\n",
    "        'betul': 'Umar telah berpindah dari sekolah ini bulan lalu.',\n",
    "    },\n",
    "    {\n",
    "        'class': 10,\n",
    "        'Description': 'kesalahan penjodoh bilangan',\n",
    "        'salah': 'Setiap orang pelajar',\n",
    "        'betul': 'Setiap pelajar.',\n",
    "    },\n",
    "    {\n",
    "        'class': 11,\n",
    "        'Description': 'kesalahan kata ganti diri',\n",
    "        'salah': 'Pencuri itu telah ditangkap. Beliau dibawa ke balai polis.',\n",
    "        'betul': 'Pencuri itu telah ditangkap. Dia dibawa ke balai polis.',\n",
    "    },\n",
    "    {\n",
    "        'class': 12,\n",
    "        'Description': 'kesalahan ayat pasif',\n",
    "        'salah': 'Cerpen itu telah dikarang oleh saya.',\n",
    "        'betul': 'Cerpen itu telah saya karang.',\n",
    "    },\n",
    "    {\n",
    "        'class': 13,\n",
    "        'Description': 'kesalahan kata tanya',\n",
    "        'salah': 'Kamu berasal dari manakah ?',\n",
    "        'betul': 'Kamu berasal dari mana ?',\n",
    "    },\n",
    "    {\n",
    "        'class': 14,\n",
    "        'Description': 'kesalahan tanda baca',\n",
    "        'salah': 'Kamu berasal dari manakah .',\n",
    "        'betul': 'Kamu berasal dari mana ?',\n",
    "    },\n",
    "    {\n",
    "        'class': 15,\n",
    "        'Description': 'kesalahan kata kerja tak transitif',\n",
    "        'salah': 'Dia kata kepada saya',\n",
    "        'betul': 'Dia berkata kepada saya',\n",
    "    },\n",
    "    {\n",
    "        'class': 16,\n",
    "        'Description': 'kesalahan kata kerja transitif',\n",
    "        'salah': 'Dia suka baca buku',\n",
    "        'betul': 'Dia suka membaca buku',\n",
    "    },\n",
    "    {\n",
    "        'class': 17,\n",
    "        'Description': 'penggunaan kata yang tidak tepat',\n",
    "        'salah': 'Tembuk Besar negeri Cina dibina oleh Shih Huang Ti.',\n",
    "        'betul': 'Tembok Besar negeri Cina dibina oleh Shih Huang Ti',\n",
    "    },\n",
    "]\n",
    "\n",
    "\n",
    "class Tatabahasa:\n",
    "    def __init__(self, d):\n",
    "        self.d = d\n",
    "        self.kesalahan = {i['Description']: no for no, i in enumerate(self.d)}\n",
    "        self.reverse_kesalahan = {v: k for k, v in self.kesalahan.items()}\n",
    "        self.vocab_size = len(self.d)\n",
    "\n",
    "    def encode(self, s):\n",
    "        return [self.kesalahan[i] for i in s]\n",
    "\n",
    "    def decode(self, ids, strip_extraneous = False):\n",
    "        return [self.reverse_kesalahan[i] for i in ids]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_xy(row, encoder):\n",
    "    x, y, tag = [], [], []\n",
    "\n",
    "    for i in range(len(row[0])):\n",
    "        t = encoder.encode(row[0][i][0])\n",
    "        tag.extend([row[1][i][1]] * len(t))\n",
    "        y.extend(t)\n",
    "        t = encoder.encode(row[1][i][0])\n",
    "        x.extend(t)\n",
    "        \n",
    "    # EOS\n",
    "    x.append(1)\n",
    "    y.append(1)\n",
    "    tag.append(0)\n",
    "    \n",
    "    return x, y, tag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "@registry.register_problem\n",
    "class Grammar(text_problems.Text2TextProblem):\n",
    "    \"\"\"grammatical error correction.\"\"\"\n",
    "\n",
    "    def feature_encoders(self, data_dir):\n",
    "        encoder = Encoder(sp)\n",
    "        t = Tatabahasa(d)\n",
    "        return {'inputs': encoder, 'targets': encoder, 'targets_error_tag': t}\n",
    "\n",
    "    def hparams(self, defaults, model_hparams):\n",
    "        super(Grammar, self).hparams(defaults, model_hparams)\n",
    "        if 'use_error_tags' not in model_hparams:\n",
    "            model_hparams.add_hparam('use_error_tags', True)\n",
    "        if 'middle_prediction' not in model_hparams:\n",
    "            model_hparams.add_hparam('middle_prediction', False)\n",
    "        if 'middle_prediction_layer_factor' not in model_hparams:\n",
    "            model_hparams.add_hparam('middle_prediction_layer_factor', 2)\n",
    "        if 'ffn_in_prediction_cascade' not in model_hparams:\n",
    "            model_hparams.add_hparam('ffn_in_prediction_cascade', 1)\n",
    "        if 'error_tag_embed_size' not in model_hparams:\n",
    "            model_hparams.add_hparam('error_tag_embed_size', 12)\n",
    "        if model_hparams.use_error_tags:\n",
    "            defaults.modality[\n",
    "                'targets_error_tag'\n",
    "            ] = modalities.ModalityType.SYMBOL\n",
    "            error_tag_vocab_size = self._encoders[\n",
    "                'targets_error_tag'\n",
    "            ].vocab_size\n",
    "            defaults.vocab_size['targets_error_tag'] = error_tag_vocab_size\n",
    "\n",
    "    def example_reading_spec(self):\n",
    "        data_fields, _ = super(Seq2edits, self).example_reading_spec()\n",
    "        data_fields['targets_error_tag'] = tf.VarLenFeature(tf.int64)\n",
    "        return data_fields, None\n",
    "\n",
    "    @property\n",
    "    def approx_vocab_size(self):\n",
    "        return 32100\n",
    "\n",
    "    @property\n",
    "    def is_generate_per_split(self):\n",
    "        return False\n",
    "\n",
    "    @property\n",
    "    def dataset_splits(self):\n",
    "        return [\n",
    "            {'split': problem.DatasetSplit.TRAIN, 'shards': 200},\n",
    "            {'split': problem.DatasetSplit.EVAL, 'shards': 1},\n",
    "        ]\n",
    "    def generate_samples(self, data_dir, tmp_dir, dataset_split):\n",
    "        \n",
    "        from glob import glob\n",
    "        files = glob('/home/husein/pure-text/*tatabahasa-*.pkl')\n",
    "        encoder = Encoder(sp)\n",
    "        \n",
    "        for file in files:\n",
    "            \n",
    "            with open(file, 'rb') as fopen:\n",
    "                data = pickle.load(fopen)\n",
    "        \n",
    "            for row in tqdm(data):\n",
    "                x, y, tag = get_xy(row, encoder)\n",
    "                if len(y) != len(tag):\n",
    "                    continue\n",
    "                tags.extend(tag)\n",
    "                yield {\n",
    "                    'inputs': x,\n",
    "                    'targets': y,\n",
    "                    'targets_error_tag': tag,\n",
    "                }\n",
    "\n",
    "    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):\n",
    "\n",
    "        generator = self.generate_samples(data_dir, tmp_dir, dataset_split)\n",
    "        for sample in generator:\n",
    "            yield sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import tensorflow as tf\n",
    "\n",
    "os.system('rm -rf t2t-tatabahasa/data')\n",
    "DATA_DIR = os.path.expanduser('t2t-tatabahasa/data')\n",
    "TMP_DIR = os.path.expanduser('t2t-tatabahasa/tmp')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf.gfile.MakeDirs(DATA_DIR)\n",
    "tf.gfile.MakeDirs(TMP_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensor2tensor.utils import registry\n",
    "from tensor2tensor import problems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/99939 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 0.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 0.\n",
      "100%|██████████| 99939/99939 [00:38<00:00, 2587.49it/s]\n",
      "  0%|          | 0/99898 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 100000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 100000.\n",
      "100%|██████████| 99898/99898 [00:43<00:00, 2299.05it/s]\n",
      "  0%|          | 0/99949 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 200000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 200000.\n",
      "100%|██████████| 99949/99949 [00:46<00:00, 2170.53it/s]\n",
      "  0%|          | 0/37236 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 300000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 300000.\n",
      "100%|██████████| 37236/37236 [00:17<00:00, 2111.24it/s]\n",
      " 63%|██████▎   | 62880/99933 [00:25<00:16, 2293.53it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 400000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 400000.\n",
      "100%|██████████| 99933/99933 [00:39<00:00, 2528.48it/s]\n",
      " 63%|██████▎   | 62952/99938 [00:27<00:17, 2138.41it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 500000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 500000.\n",
      "100%|██████████| 99938/99938 [00:42<00:00, 2341.33it/s]\n",
      " 63%|██████▎   | 62884/99955 [00:23<00:12, 3063.73it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 600000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 600000.\n",
      "100%|██████████| 99955/99955 [00:38<00:00, 2618.91it/s]\n",
      " 63%|██████▎   | 63113/99907 [00:27<00:16, 2192.06it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 700000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 700000.\n",
      "100%|██████████| 99907/99907 [00:44<00:00, 2241.94it/s]\n",
      " 63%|██████▎   | 63198/99956 [00:25<00:15, 2381.59it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 800000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 800000.\n",
      "100%|██████████| 99956/99956 [00:39<00:00, 2501.46it/s]\n",
      " 63%|██████▎   | 63053/99954 [00:24<00:14, 2485.16it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 900000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 900000.\n",
      "100%|██████████| 99954/99954 [00:38<00:00, 2592.45it/s]\n",
      " 63%|██████▎   | 63291/99942 [00:27<00:15, 2305.21it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1000000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1000000.\n",
      "100%|██████████| 99942/99942 [00:43<00:00, 2291.56it/s]\n",
      " 63%|██████▎   | 63048/99898 [00:27<00:12, 3022.30it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1100000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1100000.\n",
      "100%|██████████| 99898/99898 [00:43<00:00, 2306.03it/s]\n",
      " 63%|██████▎   | 63303/99922 [00:26<00:14, 2496.53it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1200000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1200000.\n",
      "100%|██████████| 99922/99922 [00:42<00:00, 2327.32it/s]\n",
      " 63%|██████▎   | 63428/99955 [00:28<00:17, 2082.24it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1300000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1300000.\n",
      "100%|██████████| 99955/99955 [00:43<00:00, 2292.16it/s]\n",
      " 63%|██████▎   | 63453/99935 [00:27<00:15, 2354.98it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1400000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1400000.\n",
      "100%|██████████| 99935/99935 [00:43<00:00, 2300.44it/s]\n",
      " 64%|██████▎   | 63666/99919 [00:28<00:15, 2295.85it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1500000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1500000.\n",
      "100%|██████████| 99919/99919 [00:43<00:00, 2274.73it/s]\n",
      " 64%|██████▎   | 63620/99880 [00:28<00:16, 2218.53it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1600000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1600000.\n",
      "100%|██████████| 99880/99880 [00:44<00:00, 2262.17it/s]\n",
      " 64%|██████▍   | 63797/99968 [00:25<00:16, 2217.92it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1700000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1700000.\n",
      "100%|██████████| 99968/99968 [00:38<00:00, 2566.83it/s]\n",
      " 64%|██████▍   | 63776/99906 [00:27<00:18, 1976.94it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1800000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1800000.\n",
      "100%|██████████| 99906/99906 [00:42<00:00, 2323.89it/s]\n",
      " 64%|██████▍   | 63997/99965 [00:24<00:11, 3077.95it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1900000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1900000.\n",
      "100%|██████████| 99965/99965 [00:37<00:00, 2654.81it/s]\n",
      " 64%|██████▍   | 63844/99926 [00:26<00:16, 2247.96it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2000000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2000000.\n",
      "100%|██████████| 99926/99926 [00:42<00:00, 2357.70it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generated 2035881 Examples\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generated 2035881 Examples\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Shuffling data...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Shuffling data...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensor2tensor-1.15.7-py3.6.egg/tensor2tensor/data_generators/generator_utils.py:477: tf_record_iterator (from tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use eager execution and: \n",
      "`tf.data.TFRecordDataset(path)`\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensor2tensor-1.15.7-py3.6.egg/tensor2tensor/data_generators/generator_utils.py:477: tf_record_iterator (from tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use eager execution and: \n",
      "`tf.data.TFRecordDataset(path)`\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Data shuffled.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Data shuffled.\n"
     ]
    }
   ],
   "source": [
    "PROBLEM = 'grammar'\n",
    "t2t_problem = problems.problem(PROBLEM)\n",
    "t2t_problem.generate_data(DATA_DIR, TMP_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([ 0,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17]),\n",
       " array([ 2035881, 43535622,  1310695,   295299,    13388,   114688,\n",
       "          251414,    11864,   651255,   132101,   161983,    80888,\n",
       "           11598,  1210119,   175657,    32830,   255391]))"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "np.unique(tags, return_counts = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
